In [2]:
#Time series project on big data 
In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
In [4]:
#loading few particular csv files from the dataset 
path = r'E:\Placement 2022-23\Analytics\projects\2-Time Series Data Analysis-20220907T085235Z-001\2-Time Series Data Analysis\individual_stocks_5yr'
company_list=['AAPL_data.csv','GOOG_data.csv','MSFT_data.csv','AMZN_data.csv']
all_data = pd.DataFrame()
for file in company_list:
    current_df = pd.read_csv(path+'/'+file)
    all_data = pd.concat([all_data,current_df])
all_data.shape
Out[4]:
(4752, 7)
In [5]:
all_data.head()
Out[5]:
date open high low close volume Name
0 2013-02-08 67.7142 68.4014 66.8928 67.8542 158168416 AAPL
1 2013-02-11 68.0714 69.2771 67.6071 68.5614 129029425 AAPL
2 2013-02-12 68.5014 68.9114 66.8205 66.8428 151829363 AAPL
3 2013-02-13 66.7442 67.6628 66.1742 66.7156 118721995 AAPL
4 2013-02-14 66.3599 67.3771 66.2885 66.6556 88809154 AAPL
In [6]:
all_data.dtypes
Out[6]:
date       object
open      float64
high      float64
low       float64
close     float64
volume      int64
Name       object
dtype: object
In [7]:
all_data['date'] = pd.to_datetime(all_data['date'])
In [8]:
all_data.dtypes
Out[8]:
date      datetime64[ns]
open             float64
high             float64
low              float64
close            float64
volume             int64
Name              object
dtype: object
In [9]:
#analysing the closing time of the stocks of the major stock holders as taken above. 
tech_list = all_data['Name'].unique()
In [10]:
#iterating over the tech list 
plt.figure(figsize=(10,7))
for i,company in enumerate(tech_list,1):
    plt.subplot(2,2,i)
    df = all_data[all_data['Name']==company]
    plt.plot(df['date'],df['close'])
    plt.xticks(rotation='vertical')
    plt.title(company)

#to view the dates in better way, have to change/convert the string to date format. 
In [11]:
#analysing the total volume of stock traded each day 
import plotly.express as px 
In [12]:
#iterating the list 
for company in tech_list:
    df = all_data[all_data['Name']==company]
    fig= px.line(df,x='date',y='volume',title=company)
    fig.show()

#now to zoom in or check a particular data, use the plotly function or feature to cut out that part. 
In [14]:
df.head()
Out[14]:
date open high low close volume Name
0 2013-02-08 261.40 265.25 260.555 261.95 3879078 AMZN
1 2013-02-11 263.20 263.25 256.600 257.21 3403403 AMZN
2 2013-02-12 259.19 260.16 257.000 258.70 2938660 AMZN
3 2013-02-13 261.53 269.96 260.300 269.47 5292996 AMZN
4 2013-02-14 267.37 270.65 265.400 269.24 3462780 AMZN
In [16]:
#analysing daily rise in price in only aapl dataset 
df1 = pd.read_csv(r'E:\Placement 2022-23\Analytics\projects\2-Time Series Data Analysis-20220907T085235Z-001\2-Time Series Data Analysis\individual_stocks_5yr\AAPL_data.csv')
df1.head()
Out[16]:
date open high low close volume Name
0 2013-02-08 67.7142 68.4014 66.8928 67.8542 158168416 AAPL
1 2013-02-11 68.0714 69.2771 67.6071 68.5614 129029425 AAPL
2 2013-02-12 68.5014 68.9114 66.8205 66.8428 151829363 AAPL
3 2013-02-13 66.7442 67.6628 66.1742 66.7156 118721995 AAPL
4 2013-02-14 66.3599 67.3771 66.2885 66.6556 88809154 AAPL
In [17]:
df.head()
Out[17]:
date open high low close volume Name
0 2013-02-08 261.40 265.25 260.555 261.95 3879078 AMZN
1 2013-02-11 263.20 263.25 256.600 257.21 3403403 AMZN
2 2013-02-12 259.19 260.16 257.000 258.70 2938660 AMZN
3 2013-02-13 261.53 269.96 260.300 269.47 5292996 AMZN
4 2013-02-14 267.37 270.65 265.400 269.24 3462780 AMZN
In [18]:
df1['Daily_price_change'] = df1['close'] - df1['open'] 
In [19]:
df1['1day%return'] = ((df1['close'] - df1['open'])/df1['open'])*100 
In [20]:
df1.head()
Out[20]:
date open high low close volume Name Daily_price_change 1day%return
0 2013-02-08 67.7142 68.4014 66.8928 67.8542 158168416 AAPL 0.1400 0.206751
1 2013-02-11 68.0714 69.2771 67.6071 68.5614 129029425 AAPL 0.4900 0.719832
2 2013-02-12 68.5014 68.9114 66.8205 66.8428 151829363 AAPL -1.6586 -2.421264
3 2013-02-13 66.7442 67.6628 66.1742 66.7156 118721995 AAPL -0.0286 -0.042850
4 2013-02-14 66.3599 67.3771 66.2885 66.6556 88809154 AAPL 0.2957 0.445600
In [21]:
fig = px.line(df1,x='date',y='1day%return',title=company)
fig.show()
In [22]:
#analysing monthly mean of the close data 
df2=df1.copy()
In [23]:
df2.dtypes
Out[23]:
date                   object
open                  float64
high                  float64
low                   float64
close                 float64
volume                  int64
Name                   object
Daily_price_change    float64
1day%return           float64
dtype: object
In [24]:
df2['date'] = pd.to_datetime(df2['date'])
In [25]:
df2.set_index('date',inplace=True)
In [26]:
df2.head()
Out[26]:
open high low close volume Name Daily_price_change 1day%return
date
2013-02-08 67.7142 68.4014 66.8928 67.8542 158168416 AAPL 0.1400 0.206751
2013-02-11 68.0714 69.2771 67.6071 68.5614 129029425 AAPL 0.4900 0.719832
2013-02-12 68.5014 68.9114 66.8205 66.8428 151829363 AAPL -1.6586 -2.421264
2013-02-13 66.7442 67.6628 66.1742 66.7156 118721995 AAPL -0.0286 -0.042850
2013-02-14 66.3599 67.3771 66.2885 66.6556 88809154 AAPL 0.2957 0.445600
In [27]:
#grabbing data between any two dates 
df2['2013-02-08':'2013-02-13']
Out[27]:
open high low close volume Name Daily_price_change 1day%return
date
2013-02-08 67.7142 68.4014 66.8928 67.8542 158168416 AAPL 0.1400 0.206751
2013-02-11 68.0714 69.2771 67.6071 68.5614 129029425 AAPL 0.4900 0.719832
2013-02-12 68.5014 68.9114 66.8205 66.8428 151829363 AAPL -1.6586 -2.421264
2013-02-13 66.7442 67.6628 66.1742 66.7156 118721995 AAPL -0.0286 -0.042850
In [28]:
#resamplping by month as monthly mean analysis is being done 
df2['close'].resample('M').mean().plot(kind='bar')
Out[28]:
<AxesSubplot:xlabel='date'>
In [29]:
#for year 
df2['close'].resample('Y').mean().plot(kind='bar')
Out[29]:
<AxesSubplot:xlabel='date'>
In [31]:
#analysing the correlation of diff companies stock prices 
aapl = pd.read_csv(r'E:\Placement 2022-23\Analytics\projects\2-Time Series Data Analysis-20220907T085235Z-001\2-Time Series Data Analysis\individual_stocks_5yr\AAPL_data.csv')
aapl.head()
Out[31]:
date open high low close volume Name
0 2013-02-08 67.7142 68.4014 66.8928 67.8542 158168416 AAPL
1 2013-02-11 68.0714 69.2771 67.6071 68.5614 129029425 AAPL
2 2013-02-12 68.5014 68.9114 66.8205 66.8428 151829363 AAPL
3 2013-02-13 66.7442 67.6628 66.1742 66.7156 118721995 AAPL
4 2013-02-14 66.3599 67.3771 66.2885 66.6556 88809154 AAPL
In [33]:
amzn = pd.read_csv(r'E:\Placement 2022-23\Analytics\projects\2-Time Series Data Analysis-20220907T085235Z-001\2-Time Series Data Analysis\individual_stocks_5yr\AMZN_data.csv')
amzn.head()
Out[33]:
date open high low close volume Name
0 2013-02-08 261.40 265.25 260.555 261.95 3879078 AMZN
1 2013-02-11 263.20 263.25 256.600 257.21 3403403 AMZN
2 2013-02-12 259.19 260.16 257.000 258.70 2938660 AMZN
3 2013-02-13 261.53 269.96 260.300 269.47 5292996 AMZN
4 2013-02-14 267.37 270.65 265.400 269.24 3462780 AMZN
In [34]:
msft = pd.read_csv(r'E:\Placement 2022-23\Analytics\projects\2-Time Series Data Analysis-20220907T085235Z-001\2-Time Series Data Analysis\individual_stocks_5yr\MSFT_data.csv')
msft.head()
Out[34]:
date open high low close volume Name
0 2013-02-08 27.35 27.71 27.31 27.55 33318306 MSFT
1 2013-02-11 27.65 27.92 27.50 27.86 32247549 MSFT
2 2013-02-12 27.88 28.00 27.75 27.88 35990829 MSFT
3 2013-02-13 27.93 28.11 27.88 28.03 41715530 MSFT
4 2013-02-14 27.92 28.06 27.87 28.04 32663174 MSFT
In [35]:
goog = pd.read_csv(r'E:\Placement 2022-23\Analytics\projects\2-Time Series Data Analysis-20220907T085235Z-001\2-Time Series Data Analysis\individual_stocks_5yr\GOOG_data.csv')
goog.head()
Out[35]:
date open high low close volume Name
0 2014-03-27 568.000 568.00 552.92 558.46 13052 GOOG
1 2014-03-28 561.200 566.43 558.67 559.99 41003 GOOG
2 2014-03-31 566.890 567.00 556.93 556.97 10772 GOOG
3 2014-04-01 558.710 568.45 558.71 567.16 7932 GOOG
4 2014-04-02 565.106 604.83 562.19 567.00 146697 GOOG
In [36]:
close = pd.DataFrame()
In [37]:
close['aapl'] = aapl['close']
close['amzn'] = amzn['close'] 
close['msft'] = msft['close']
close['goog'] = goog['close']
In [38]:
close.head()
Out[38]:
aapl amzn msft goog
0 67.8542 261.95 27.55 558.46
1 68.5614 257.21 27.86 559.99
2 66.8428 258.70 27.88 556.97
3 66.7156 269.47 28.03 567.16
4 66.6556 269.24 28.04 567.00
In [39]:
import seaborn as sns
In [40]:
sns.pairplot(data=close)
Out[40]:
<seaborn.axisgrid.PairGrid at 0x19d10493760>
In [42]:
sns.heatmap(close.corr(),annot=True)
#conclusion is amzn and msft are most corr and goog and aapl are least. 
Out[42]:
<AxesSubplot:>
In [43]:
#analysing the daily return of stocks and their correlation 
data = pd.DataFrame()
In [44]:
data['aapl_change'] = ((aapl['close'] - aapl['open'])/aapl['close'])*100
data['amzn_change'] = ((amzn['close'] - amzn['open'])/amzn['close'])*100
data['msft_change'] = ((msft['close'] - msft['open'])/msft['close'])*100
data['goog_change'] = ((goog['close'] - goog['open'])/goog['close'])*100
data.head()
Out[44]:
aapl_change amzn_change msft_change goog_change
0 0.206325 0.209964 0.725953 -1.708269
1 0.714688 -2.328836 0.753769 -0.216075
2 -2.481344 -0.189409 0.000000 -1.781065
3 -0.042869 2.946525 0.356761 1.489879
4 0.443624 0.694548 0.427960 0.334039
In [45]:
sns.pairplot(data=data)
Out[45]:
<seaborn.axisgrid.PairGrid at 0x19d13155b20>
In [46]:
sns.heatmap(data.corr(),annot=True)
Out[46]:
<AxesSubplot:>
In [47]:
#value at risk analysis for different tech companies 
sns.distplot(data['aapl_change'])
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

Out[47]:
<AxesSubplot:xlabel='aapl_change', ylabel='Density'>
In [48]:
#finding standard deviation 
data['aapl_change'].std
#approx 68% data 
Out[48]:
<bound method NDFrame._add_numeric_operations.<locals>.std of 0       0.206325
1       0.714688
2      -2.481344
3      -0.042869
4       0.443624
          ...   
1254    0.366551
1255   -3.426791
1256   -1.667838
1257    5.029749
1258   -2.222013
Name: aapl_change, Length: 1259, dtype: float64>
In [49]:
data['aapl_change'].std()*2
#approx 95% data 
Out[49]:
2.3742754262842474
In [50]:
data['aapl_change'].std()*3
#approx 98% of entire data 
Out[50]:
3.561413139426371
In [51]:
data['aapl_change'].quantile(0.1)
#says that 90% of the time, worst daily loss wont exceed this value. 
Out[51]:
-1.4246644227944307
In [53]:
data.describe().T
Out[53]:
count mean std min 25% 50% 75% max
aapl_change 1259.0 -0.000215 1.187138 -7.104299 -0.658021 0.042230 0.715427 8.000388
amzn_change 1259.0 -0.000398 1.358679 -9.363077 -0.738341 -0.002623 0.852568 5.640265
msft_change 1259.0 0.076404 1.059260 -5.177618 -0.509241 0.061069 0.703264 4.861491
goog_change 975.0 -0.012495 1.092560 -5.952266 -0.551963 0.024951 0.672649 4.943550